Carlos Manchini
2024-02-28
Target e Médias das Variáveis Preditoras
| y | Contagem | Prop | VAR_1 | VAR_2 | VAR_3 | VAR_4 | VAR_5 | VAR_6 | VAR_7 | VAR_8 | VAR_9 | VAR_10 | VAR_11 | VAR_12 | VAR_13 | VAR_14 | VAR_15 | VAR_16 | VAR_17 | VAR_18 | VAR_19 | VAR_20 | VAR_21 | VAR_22 | VAR_23 | VAR_24 | VAR_25 | VAR_26 | VAR_27 | VAR_28 | VAR_29 | VAR_30 | VAR_31 | VAR_32 | VAR_33 | VAR_34 | VAR_35 | VAR_36 | VAR_37 | VAR_38 | VAR_39 | VAR_40 | VAR_41 | VAR_42 | VAR_43 | VAR_44 | VAR_45 | VAR_46 | VAR_47 | VAR_48 | VAR_49 | VAR_50 | VAR_51 | VAR_52 | VAR_53 | VAR_54 | VAR_55 | VAR_56 | VAR_57 | VAR_58 | VAR_59 | VAR_60 | VAR_61 | VAR_62 | VAR_63 | VAR_64 | VAR_65 | VAR_66 | VAR_67 | VAR_68 | VAR_69 | VAR_70 | VAR_71 | VAR_72 | VAR_73 | VAR_74 | VAR_75 | VAR_76 | VAR_77 | VAR_78 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 7610 | 70.87 | 22.9 | 0.2 | 0.2 | 0.5 | 93.5 | 1664.2 | 67.4 | 69.5 | 1107.4 | 462.9 | 69.1 | 462.3 | 90.6 | 630.5 | 66.2 | 89.4 | 2492.5 | 58.3 | 9.2 | 10.1 | 1289.1 | 12.0 | 128.6 | 380.7 | 2.7 | 380.7 | 428.4 | 3.3 | 6.3 | 1681.2 | 14.9 | 0.1 | 2.9 | 289.3 | 285.5 | 3.5 | 4.4 | 274.5 | 326.0 | 3.6 | 3.6 | 8.7 | 18.9 | 4.5 | 373.4 | 10.1 | 74.5 | 1.4 | 3.5 | 200.5 | 369.5 | 268.7 | 2359.3 | 1810.5 | 199.2 | 202.4 | 47.1 | 333.2 | 315.0 | 0.0 | 3.2 | 1262.4 | 194.0 | 0.6 | 1330.3 | 17.8 | -222.6 | -0.8 | 531.4 | 4.5 | 0.4 | 176.1 | 0.4 | 0.5 | 629.4 | 480.6 | 64.8 | 0.5 |
| 1 | 3128 | 29.13 | 57.1 | 0.8 | 0.5 | 0.8 | 90.2 | 1064.0 | 68.9 | 74.5 | 1276.0 | 448.6 | 69.8 | 437.7 | 88.1 | 515.3 | 66.2 | 85.1 | 2044.1 | 52.5 | 4.5 | 11.2 | 1128.2 | 6.2 | 126.0 | 367.1 | 1.4 | 266.1 | 377.0 | 1.9 | 12.0 | 1796.7 | 9.0 | 0.1 | 1.7 | 322.6 | 296.8 | 4.9 | 10.6 | 286.4 | 314.5 | 2.3 | 5.2 | 15.7 | 17.6 | 3.1 | 363.5 | 18.5 | 85.2 | 8.2 | 5.4 | 201.4 | 339.5 | 269.7 | 2302.5 | 1775.9 | 198.4 | 203.3 | 43.3 | 318.3 | 290.6 | -0.1 | 4.3 | 1221.7 | 195.5 | 0.7 | 925.8 | 17.2 | -183.5 | 5.0 | 517.6 | 6.2 | 0.6 | 131.3 | 0.6 | 0.9 | 655.1 | 464.5 | 65.5 | 0.9 |
set.seed(369)
index <- caret::createDataPartition(data$y, p = 0.7, list = FALSE)
train0 <- data[index, ] ; cat(train0$y %>% table %>% prop.table*100)70.97246 29.02754
70.63024 29.36976
library(plotly)
ts_y <- ggplot(data, aes(x = AnoMes, fill = factor(y))) +
geom_bar(position = "dodge", width = 25) +
scale_fill_manual(values = c("darkgrey", "springgreen3"), labels = c("0", "1")) +
labs(x = "Data", y = "Número de Casos", title = "Série Temporal - Target - 2014", fill = "Target") +
scale_x_date(date_labels = "%b", date_breaks = "1 month")
ggplotly(ts_y, height = 400, align="center")Proporção de Valores Ausentes e Information Value (IV)
# Gráfico VAR_1
plot_VAR_1 <- ggplot(aggregate(VAR_1 ~ AnoMes, data = data, FUN = sum), aes(x = AnoMes, y = VAR_1)) +
geom_line(color="springgreen3", size=1.1) + labs(x = "", y = "VAR_1", title = "Comportamento das Variáveis VAR_1:3") +
scale_x_date(date_labels = "%b %Y", date_breaks = "1 month")
pl1 <- ggplotly(plot_VAR_1)
# Gráfico VAR_2
plot_VAR_2 <- ggplot(aggregate(VAR_2 ~ AnoMes, data = data, FUN = sum), aes(x = AnoMes, y = VAR_2)) +
geom_line(color="springgreen3", size=1.1) + labs(x = "", y = "VAR_2") +
scale_x_date(date_labels = "%b %Y", date_breaks = "1 month")
pl2 <- ggplotly(plot_VAR_2)
# Gráfico VAR_3
plot_VAR_3 <- ggplot(aggregate(VAR_3 ~ AnoMes, data = data, FUN = sum), aes(x = AnoMes, y = VAR_3)) +
geom_line(color="springgreen3", size=1.1) + labs(x = "", y = "VAR_3") +
scale_x_date(date_labels = "%b %Y", date_breaks = "1 month")
pl3 <- ggplotly(plot_VAR_3)
subplot(pl1, pl2, pl3, nrows = 3, shareY = FALSE, margin = .05, titleY = T, widths = .99, heights = c(.33,.33,.33))bestiv <- IV$Summary[IV$Summary$IV >= 0.1, ]$Variable
train <- train0 %>% subset(select = c("id","y", bestiv))
test <- test0 %>% subset(select = c("y",bestiv))
corrplot::corrplot(cor(train %>% select(id, y, bestiv[1:20]), use = "pairwise.complete.obs"),
method = "circle",
type = "upper",
diag = TRUE,
tl.col = "black")library(reshape2) ; library(plyr)
nvars <- length(dendro[dendro$height<0.5]) #critério agregacao
part_init <- cutreevar(dendro, nvars)$cluster
kmeans <- kmeansvar(X.quanti = as.data.frame(train[,!(names(train) %in% c("id","y"))]),init = part_init)
clusters <- cbind.data.frame(melt(kmeans$cluster), row.names(melt(kmeans$cluster)))
names(clusters) <- c("Cluster", "Variable")
clusters <- join(clusters, IV$Summary, by="Variable", type="left")
clusters <- clusters[order(clusters$Cluster),]
clusters$Rank <- stats::ave(-clusters$AdjIV, clusters$Cluster, FUN=rank)
VARIAVEISclust <- subset(clusters, Rank==1) #%in% 1:2
DT::datatable(clusters %>% select(Cluster, Variable, IV, Rank),
width = 559,
options = list(pageLength = 15, columnDefs = list(list(className = 'dt-center', targets = 1:3))))bestiv_clust <- VARIAVEISclust$Variable
train2 <- train %>% select("y", all_of(bestiv_clust))
test2 <- test %>% select("y", all_of(bestiv_clust))
corrplot::corrplot(cor(train2, use = "pairwise.complete.obs"),
method = "circle",
type = "upper",
diag = TRUE,
tl.col = "black",
addCoef.col = "black",
number.cex = .6){breaks <- list(
VAR_1=c("missing%,%5", "30", "80", "140"),
VAR_6=c("missing%,%300", "800", "1400"),
VAR_20=c("5", "9", "11"),
VAR_47=c("missing","20"),
VAR_32=c("0.09063413815", "0.1338329127"),
VAR_19=c("missing%,%1", "13", "23" ),
VAR_18=c("missing","15", "25", "65"),
VAR_17=c("missing","1000", "1400", "3200", "4800"),
VAR_43=c("missing", "8", "28"),
VAR_65=c("missing","500", "1100", "1900"),
VAR_37=c("missing","2", "6", "20"),
VAR_77=c("missing", "10"),
VAR_57=c("30", "41", "46", "53"),
VAR_14=c("missing","350", "650", "1100"),
VAR_78=c("missing","1")
)}
bin_adj <- woebin(train2, y="y", positive=1, breaks_list = breaks)
bin_adj_test <- woebin(test2, y="y", positive=1, breaks_list = breaks)
plotsbin_adj <- woebin_plot(bin_adj, line_color='blue', bar_color=c('springgreen3','gray'))
plotsbin_adjwoe <- woebin_plot(bin_adj, line_value = "woe", line_color='blue', bar_color=c('springgreen3','gray'))
ggplotly(plotsbin_adjwoe$VAR_1)| variable | bin | count | count_distr | neg | pos | posprob | woe | bin_iv |
|---|---|---|---|---|---|---|---|---|
| VAR_1 | [-Inf,5)%,%missing | 3048 | 0.4054809 | 2449 | 599 | 0.1965223 | -0.5141265 | 0.0948693 |
| VAR_1 | [5,30) | 2219 | 0.2951976 | 1703 | 516 | 0.2325372 | -0.2999929 | 0.0248191 |
| VAR_1 | [30,80) | 1268 | 0.1686843 | 838 | 430 | 0.3391167 | 0.2268141 | 0.0090705 |
| VAR_1 | [80,140) | 487 | 0.0647865 | 236 | 251 | 0.5154004 | 0.9556681 | 0.0676574 |
| VAR_1 | [140, Inf) | 495 | 0.0658507 | 109 | 386 | 0.7797980 | 2.1585365 | 0.3377480 |
| variable | bin | count | count_distr | neg | pos | posprob | woe | bin_iv |
|---|---|---|---|---|---|---|---|---|
| VAR_37 | missing | 4260 | 0.5667154 | 2893 | 1367 | 0.3208920 | 0.1443715 | 0.0121592 |
| VAR_37 | [-Inf,2) | 1298 | 0.1726753 | 1077 | 221 | 0.1702619 | -0.6897250 | 0.0693803 |
| VAR_37 | [2,6) | 600 | 0.0798191 | 459 | 141 | 0.2350000 | -0.2862433 | 0.0061302 |
| VAR_37 | [6,20) | 827 | 0.1100173 | 582 | 245 | 0.2962515 | 0.0288348 | 0.0000920 |
| VAR_37 | [20, Inf) | 532 | 0.0707729 | 324 | 208 | 0.3909774 | 0.4508416 | 0.0155966 |
| Estimate | Std. Error | z value | Pr(>|z|) | OddsRatio | Chance | |
|---|---|---|---|---|---|---|
| (Intercept) | -0.889 | 0.029 | -30.753 | 0.000 | NA | NA |
| VAR_1_woe | 1.021 | 0.042 | 24.298 | 0.000 | 2.775969 | 177.59693 |
| VAR_6_woe | 0.282 | 0.136 | 2.076 | 0.038 | 1.325779 | 32.57787 |
| VAR_20_woe | 1.310 | 0.079 | 16.616 | 0.000 | 3.706174 | 270.61737 |
| VAR_47_woe | 0.223 | 0.119 | 1.875 | 0.061 | 1.249821 | 24.98206 |
| VAR_19_woe | 0.286 | 0.133 | 2.149 | 0.032 | 1.331093 | 33.10925 |
| VAR_17_woe | 0.377 | 0.094 | 4.019 | 0.000 | 1.457904 | 45.79043 |
| VAR_37_woe | 0.503 | 0.096 | 5.249 | 0.000 | 1.653675 | 65.36749 |
| VAR_77_woe | 0.570 | 0.117 | 4.871 | 0.000 | 1.768267 | 76.82671 |
| VAR_57_woe | 0.692 | 0.087 | 7.915 | 0.000 | 1.997707 | 99.77070 |
| Dados | MSE | RMSE | LogLoss | R2 | KS | AUC | Gini |
|---|---|---|---|---|---|---|---|
| Treino | 0.1644 | 0.4054 | 0.4988 | 0.2021 | 0.3835 | 0.7644 | 0.5288 |
| Teste | 0.1700 | 0.4123 | 0.5131 | 0.1806 | 0.3909 | 0.7567 | 0.5134 |
Validação Cruzada
| datset | train | validation |
|---|---|---|
| 1 | 0.3793175 | 0.4577518 |
| 10 | 0.3880372 | 0.3726454 |
| 2 | 0.3835985 | 0.4069363 |
| 3 | 0.3820137 | 0.4271099 |
| 4 | 0.3871181 | 0.3892653 |
| 5 | 0.3881241 | 0.3948260 |
| 6 | 0.3796413 | 0.4647385 |
| 7 | 0.3843862 | 0.3991380 |
| 8 | 0.3947458 | 0.3351765 |
| 9 | 0.3865076 | 0.3680718 |
| datset | train | validation |
|---|---|---|
| 1 | 0.7606399 | 0.7965824 |
| 10 | 0.7649281 | 0.7601584 |
| 2 | 0.7624011 | 0.7820006 |
| 3 | 0.7626216 | 0.7819101 |
| 4 | 0.7648676 | 0.7601895 |
| 5 | 0.7660196 | 0.7498796 |
| 6 | 0.7623814 | 0.7871096 |
| 7 | 0.7662695 | 0.7457885 |
| 8 | 0.7698920 | 0.7113235 |
| 9 | 0.7658792 | 0.7521899 |
| datset | train | validation |
|---|---|---|
| 1 | 0.1647146 | 0.1614745 |
| 10 | 0.1645354 | 0.1632099 |
| 2 | 0.1646252 | 0.1620947 |
| 3 | 0.1656327 | 0.1522315 |
| 4 | 0.1645093 | 0.1632295 |
| 5 | 0.1642816 | 0.1651660 |
| 6 | 0.1653753 | 0.1550888 |
| 7 | 0.1644799 | 0.1635406 |
| 8 | 0.1623144 | 0.1849546 |
| 9 | 0.1627372 | 0.1791463 |
Teste de Goldfeld-Quandt (Homocedasticidade)
Goldfeld-Quandt test
data: logstep
GQ = 1.0007, df1 = 3749, df2 = 3748, p-value = 0.4919
alternative hypothesis: variance increases from segment 1 to 2
VIF (Variance Inflation Factor)
variable gvif
1: VAR_1_woe 1.062664
2: VAR_6_woe 3.883454
3: VAR_20_woe 1.049868
4: VAR_47_woe 2.232550
5: VAR_19_woe 2.785666
6: VAR_17_woe 1.178274
7: VAR_37_woe 1.083164
8: VAR_77_woe 1.769231
9: VAR_57_woe 1.057920
Matriz de Confusão
Confusion Matrix and Statistics
Reference
Prediction 0 1
0 2156 681
1 119 265
Accuracy : 0.7516
95% CI : (0.7363, 0.7665)
No Information Rate : 0.7063
P-Value [Acc > NIR] : 5.366e-09
Kappa : 0.2757
Mcnemar's Test P-Value : < 2.2e-16
Sensitivity : 0.9477
Specificity : 0.2801
Pos Pred Value : 0.7600
Neg Pred Value : 0.6901
Prevalence : 0.7063
Detection Rate : 0.6694
Detection Prevalence : 0.8808
Balanced Accuracy : 0.6139
'Positive' Class : 0
card <- scorecard(bin_adj, logstep, points0 = 600, odds0 = 1/20, pdo=20)
exp_eta <- exp(predict(logstep))
factor <- 20/log(2)
offset <- 600-factor*log(20)
data.frame(odds=exp_eta,
prob_calc=round(exp_eta/(exp_eta+1),2),
prob_pred=round(train_pred,2),
scorecard = scorecard_ply(train2, card, print_step = 0),
score_calc = round(offset-factor*predict(logstep),2)) %>%
DT::datatable(width = 599,
options = list(pageLength = 8, searching = FALSE, lengthChange = FALSE))
–>
–>
–>
–>
